library(ggplot2)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(ggExtra)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(viridis)
## Loading required package: viridisLite
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(lattice)
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(caret)
library(gains)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Load the cleaned choice-response dataset and build typed columns.
# NOTE: the absolute Windows path is machine-specific; adjust as needed.
data <- read.csv("E:/Users/ASUS/Documents/term8/ravesh tahghigh/clean data.csv",
                 header = TRUE)
# Derive the typed globals from the data frame columns directly instead of
# attach(): attach() puts the frame on the search path, silently masks
# other names, and keeps stale copies after columns are reassigned.
subj <- as.factor(data$subj)           # participant id
trial <- as.numeric(data$trial)        # trial index
s <- as.factor(data$S)                 # stimulus
r <- as.factor(data$R)                 # response
rt <- as.numeric(data$RT)              # response time
# Recode the correctness flag to a 0/1 factor (1 = correct, 0 = wrong).
correct <- as.factor(ifelse(data$correct == "TRUE", 1, 0))
instruction <- as.factor(data$instruction)
new.data <- data.frame(subj, trial, s, r, rt, correct, instruction)
# EDA PART
# Histogram of response time in the accuracy-instruction condition.
accuracy.rt <- subset(new.data, instruction == "accuracy",
                      select = rt, drop = TRUE)
hist(accuracy.rt, col = "pink",
     main = "histogram of response time when instruction = accuracy",
     xlab = "response time", ylab = "frequency")
# Correct vs subject: horizontal grouped bars of right/wrong counts
# per participant.
correct.subj <- table(correct, subj)
barplot(correct.subj,
        legend.text = TRUE,
        beside = TRUE,
        main = "number of right and wrong choices for each person",
        xlab = "",
        ylab = "", col = c("darkorchid4", "darkorchid1"),
        horiz = TRUE, las = 1)
# Instruction vs correct: grouped bar chart of right/wrong counts
# per instruction condition.
correct.instruction <- table(correct, instruction)
correct.instruction.bar <- barplot(correct.instruction,
                                   legend.text = TRUE,
                                   beside = TRUE,
                                   main = "number of right and wrong choices for each instruction",
                                   xlab = "type of instruction",
                                   ylab = "", col = c("aliceblue", "cadetblue1"),
                                   horiz = FALSE, las = 1)
# Response time vs correct: overlaid semi-transparent histograms for
# correct ("1") and wrong ("0") trials.
true.rt <- subset(new.data, correct == "1", select = rt, drop = TRUE)
false.rt <- subset(new.data, correct == "0", select = rt, drop = TRUE)
hist(true.rt, breaks = 30, col = rgb(1, 0, 0, 0.5), xlab = "response time",
     ylab = "frequency", main = "distribution of response time for each choice")
hist(false.rt, breaks = 30, col = rgb(0, 0, 1, 0.5), add = TRUE)
legend("topright", legend = c("TRUE", "FALSE"),
       col = c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), pt.cex = 2, pch = 15)
# Response time vs instruction: boxplots per instruction type.
# (Removed the unused `instruction.rt <- table(rt, instruction)`:
# cross-tabulating a continuous variable builds a huge, meaningless
# table, and the result was never referenced anywhere.)
boxplot(rt ~ instruction, col = terrain.colors(4),
        main = "distribution of response time for each instruction type",
        xlab = "instruction",
        ylab = "response time")
# Response time vs instruction: overlaid density curves, one per
# instruction condition.
ggplot(data = new.data, aes(x = rt, group = instruction, fill = instruction)) +
  geom_density(adjust = 1.5, alpha = .4) +
  theme_ipsum() +
  ylab("") +
  xlab("response time") +
  labs(title = "distribution of response time for each type of instruction")
# Response time vs instruction vs correct: interactive heat map of the
# mean RT in each (instruction, correct) cell.
# aggregate() already returns a data frame, so the former redundant
# `data.frame(mean.rt)` re-wrap was removed.
mean.rt <- aggregate(rt ~ instruction + correct, data = new.data, mean)
mean.rt <- mean.rt %>%
  mutate(text = paste0("response time mean: ", rt))  # hover tooltip text
p <- ggplot(mean.rt, aes(instruction, correct, fill = rt, text = text)) +
  geom_tile() +
  theme_ipsum()
ggplotly(p, tooltip = "text")
#test
# Chi-squared tests of independence between pairs of categorical
# variables; the recorded console output follows each call.
# subj vs correct: accuracy differs strongly across participants.
chisq.test(subj,correct)
##
## Pearson's Chi-squared test
##
## data: subj and correct
## X-squared = 971.2, df = 18, p-value < 2.2e-16
# subj vs instruction: weak association between participant and condition.
chisq.test(subj,instruction)
##
## Pearson's Chi-squared test
##
## data: subj and instruction
## X-squared = 56.074, df = 36, p-value = 0.01763
# instruction vs correct: accuracy clearly depends on the instruction.
chisq.test(instruction,correct)
##
## Pearson's Chi-squared test
##
## data: instruction and correct
## X-squared = 219.72, df = 2, p-value < 2.2e-16
# Train / validation / test split (9500 / 4000 / remaining rows).
# NOTE: sample() returns integer positions while row.names() returns
# character labels; setdiff() silently coerces both to character, which
# only works because default row names are "1".."n".  The coercion is now
# explicit; the seeded sampling sequence (and hence the recorded tables)
# is unchanged.
set.seed(42)
train_rows <- sample(nrow(new.data), 9500)
train_data <- new.data[train_rows, ]
table(train_data$correct)
##
## 0 1
## 1510 7990
validation_rows <- sample(setdiff(row.names(new.data),
                                  as.character(train_rows)), 4000)
validation_data <- new.data[validation_rows, ]
table(validation_data$correct)
##
## 0 1
## 663 3337
test_rows <- setdiff(row.names(new.data),
                     c(as.character(train_rows), validation_rows))
test_data <- new.data[test_rows, ]
table(test_data$correct)
##
## 0 1
## 351 1967
# Logistic regression: probability of a correct choice from subject,
# response time and instruction; evaluated on the validation split with
# a 0.5 probability cut-off.
model <- glm(correct ~ subj + rt + instruction,
             data = train_data, family = "binomial")
predict.model.validation <- predict(model, validation_data, type = "response")
confusionMatrix(
  data = as.factor(as.integer(predict.model.validation > 0.5)),
  reference = validation_data$correct)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8 9
## 1 655 3328
##
## Accuracy : 0.834
## 95% CI : (0.8221, 0.8454)
## No Information Rate : 0.8342
## P-Value [Acc > NIR] : 0.5273
##
## Kappa : 0.0154
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.01207
## Specificity : 0.99730
## Pos Pred Value : 0.47059
## Neg Pred Value : 0.83555
## Prevalence : 0.16575
## Detection Rate : 0.00200
## Detection Prevalence : 0.00425
## Balanced Accuracy : 0.50468
##
## 'Positive' Class : 0
##
#over fitting check
# Same cut-off applied to the training split; training accuracy (0.8411)
# is close to validation accuracy (0.834), so no over-fit is evident.
predict.model.train <- predict(model, train_data, type = "response")
confusionMatrix(
  data = as.factor(as.integer(predict.model.train > 0.5)),
  reference = train_data$correct)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 22 22
## 1 1488 7968
##
## Accuracy : 0.8411
## 95% CI : (0.8335, 0.8484)
## No Information Rate : 0.8411
## P-Value [Acc > NIR] : 0.5069
##
## Kappa : 0.0195
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.014570
## Specificity : 0.997247
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.842640
## Prevalence : 0.158947
## Detection Rate : 0.002316
## Detection Prevalence : 0.004632
## Balanced Accuracy : 0.505908
##
## 'Positive' Class : 0
##
#step methods
#backward
# Backward elimination from the full model; the AIC trace below shows
# every term is kept (removing any predictor raises the AIC).
model.back <- stats::step(model, direction = "backward")
## Start: AIC=7567.55
## correct ~ subj + rt + instruction
##
## Df Deviance AIC
## <none> 7523.6 7567.6
## - rt 1 7546.5 7588.5
## - instruction 2 7617.8 7657.8
## - subj 18 8183.0 8191.0
model.back.pred <- predict(model.back, validation_data, type = "response")
confusionMatrix(
  data = as.factor(as.integer(model.back.pred > 0.5)),
  reference = validation_data$correct)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8 9
## 1 655 3328
##
## Accuracy : 0.834
## 95% CI : (0.8221, 0.8454)
## No Information Rate : 0.8342
## P-Value [Acc > NIR] : 0.5273
##
## Kappa : 0.0154
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.01207
## Specificity : 0.99730
## Pos Pred Value : 0.47059
## Neg Pred Value : 0.83555
## Prevalence : 0.16575
## Detection Rate : 0.00200
## Detection Prevalence : 0.00425
## Balanced Accuracy : 0.50468
##
## 'Positive' Class : 0
##
#forward
# NOTE(review): forward selection here starts FROM the already-full model
# and no `scope` is supplied, so step() has nothing to add and returns
# the model unchanged — see the single "Start:" line below with no search
# steps.  A true forward search would start from an intercept-only model
# with scope = formula(model).  Confirm intent before changing, since the
# recorded outputs below would change too.
model.for=model %>% stats::step(direction = "forward")
## Start: AIC=7567.55
## correct ~ subj + rt + instruction
model.for.pred=predict(model.for,validation_data,type="response")
confusionMatrix(
data=as.factor(ifelse(model.for.pred>0.5,1,0)),
reference=validation_data$correct)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8 9
## 1 655 3328
##
## Accuracy : 0.834
## 95% CI : (0.8221, 0.8454)
## No Information Rate : 0.8342
## P-Value [Acc > NIR] : 0.5273
##
## Kappa : 0.0154
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.01207
## Specificity : 0.99730
## Pos Pred Value : 0.47059
## Neg Pred Value : 0.83555
## Prevalence : 0.16575
## Detection Rate : 0.00200
## Detection Prevalence : 0.00425
## Balanced Accuracy : 0.50468
##
## 'Positive' Class : 0
##
#both
# Bidirectional stepwise search; again all three terms survive, so the
# validation confusion matrix matches the full model's.
model.both <- stats::step(model, direction = "both")
## Start: AIC=7567.55
## correct ~ subj + rt + instruction
##
## Df Deviance AIC
## <none> 7523.6 7567.6
## - rt 1 7546.5 7588.5
## - instruction 2 7617.8 7657.8
## - subj 18 8183.0 8191.0
model.both.pred <- predict(model.both, validation_data, type = "response")
confusionMatrix(
  data = as.factor(as.integer(model.both.pred > 0.5)),
  reference = validation_data$correct)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8 9
## 1 655 3328
##
## Accuracy : 0.834
## 95% CI : (0.8221, 0.8454)
## No Information Rate : 0.8342
## P-Value [Acc > NIR] : 0.5273
##
## Kappa : 0.0154
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.01207
## Specificity : 0.99730
## Pos Pred Value : 0.47059
## Neg Pred Value : 0.83555
## Prevalence : 0.16575
## Detection Rate : 0.00200
## Detection Prevalence : 0.00425
## Balanced Accuracy : 0.50468
##
## 'Positive' Class : 0
##
#lift chart
logit.reg.lift <- lift(relevel(as.factor(correct), ref = "1") ~
                         predict.model.validation, data = validation_data)
xyplot(logit.reg.lift, plot = "gain")
#decile lift chart
# BUG FIX: as.numeric() on a 0/1 factor returns the internal level codes
# 1/2, not the 0/1 outcomes, which shifts the mean-response ratio below;
# round-trip through as.character() to recover the true 0/1 values.
actual <- as.numeric(as.character(validation_data$correct))
gain <- gains(actual, predict.model.validation)
heights <- gain$mean.resp / mean(actual)
decile.chart <- barplot(heights, names.arg = gain$depth,
                        xlab = "percentile", ylab = "mean response")
#roc plot
# Stored as `roc.curve` rather than `r`, so the response factor `r`
# created during preprocessing is no longer clobbered.
roc.curve <- roc(validation_data$correct, predict.model.validation)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(roc.curve)
auc(roc.curve)
## Area under the curve: 0.7101
# Gamma GLM for response time with an identity link, followed by a crude
# classification check: fitted and observed RT are both dichotomised at
# 0.5 and cross-tabulated on the validation split.
gamma.model <- glm(rt ~ correct + instruction, data = train_data,
                   family = Gamma(link = "identity"))
predict.gamma.model.validation <- predict(gamma.model, validation_data,
                                          type = "response")
confusionMatrix(
  data = as.factor(as.integer(predict.gamma.model.validation > 0.5)),
  reference = as.factor(as.integer(validation_data$rt > 0.5)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1941 684
## 1 764 611
##
## Accuracy : 0.638
## 95% CI : (0.6229, 0.6529)
## No Information Rate : 0.6762
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : 0.1864
##
## Mcnemar's Test P-Value : 0.03789
##
## Sensitivity : 0.7176
## Specificity : 0.4718
## Pos Pred Value : 0.7394
## Neg Pred Value : 0.4444
## Prevalence : 0.6763
## Detection Rate : 0.4853
## Detection Prevalence : 0.6562
## Balanced Accuracy : 0.5947
##
## 'Positive' Class : 0
##
#over fitting check
# Same dichotomised comparison on the training split; training accuracy
# (0.65) is close to validation accuracy (0.638).
predict.gamma.model.train <- predict(gamma.model, train_data, type = "response")
confusionMatrix(
  data = as.factor(as.integer(predict.gamma.model.train > 0.5)),
  reference = as.factor(as.integer(train_data$rt > 0.5)))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4766 1576
## 1 1749 1409
##
## Accuracy : 0.65
## 95% CI : (0.6403, 0.6596)
## No Information Rate : 0.6858
## P-Value [Acc > NIR] : 1.000000
##
## Kappa : 0.2004
##
## Mcnemar's Test P-Value : 0.002856
##
## Sensitivity : 0.7315
## Specificity : 0.4720
## Pos Pred Value : 0.7515
## Neg Pred Value : 0.4462
## Prevalence : 0.6858
## Detection Rate : 0.5017
## Detection Prevalence : 0.6676
## Balanced Accuracy : 0.6018
##
## 'Positive' Class : 0
##
#visualization of gamma model
# Overlay the implied gamma density on the observed RT histogram.
# BUG FIX: the dispersion parameter was hard-coded (0.08938668, evidently
# copied from a summary() printout); compute it from the fitted model so
# the plot stays correct if the model is refit on different data.
m0_dispersion <- summary(gamma.model)$dispersion
# shape: 1 divided by dispersion parameter
m0_shape <- 1 / m0_dispersion
# scale: mean/shape
# NOTE(review): sum(coef(...)) equals the fitted mean only for the
# covariate cell where every dummy is 1 — confirm this is the intended
# reference condition before relying on this curve.
m0_scale <- sum(coef(gamma.model)) / m0_shape
hist(rt, breaks = 40, freq = FALSE)
curve(dgamma(x, shape = m0_shape, scale = m0_scale),
      from = 0.2, to = 1.4, col = "red", add = TRUE)
#compare simulation of this gamma model with real data
# Draw 1000 simulated response vectors and overlay the first 50 densities
# on the empirical RT density.
sims1 <- simulate(gamma.model, nsim = 1000)
plot(density(rt),
     main = "Simulated data for gamma model")
for (i in seq_len(50)) lines(density(sims1[[i]]), col = "green")